library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(leaps)
library(gridExtra)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
Let’s begin by reading the census data
# Read the raw NYC census data (one row per census tract).
census_df_full <- read.csv("nyc_census.csv")
set.seed(1) # fix the RNG so every sampling step below is reproducible
dim(census_df_full) # rows x columns
## [1] 2167 36
Our data contains 2167 entries and 36 variables (or features)
# List all 36 column names of the raw data (colnames == names for a data.frame).
colnames(census_df_full)
## [1] "CensusTract" "County" "Borough"
## [4] "TotalPop" "Men" "Women"
## [7] "Hispanic" "White" "Black"
## [10] "Native" "Asian" "Citizen"
## [13] "Income" "IncomeErr" "IncomePerCap"
## [16] "IncomePerCapErr" "Poverty" "ChildPoverty"
## [19] "Professional" "Service" "Office"
## [22] "Construction" "Production" "Drive"
## [25] "Carpool" "Transit" "Walk"
## [28] "OtherTransp" "WorkAtHome" "MeanCommute"
## [31] "Employed" "PrivateWork" "PublicWork"
## [34] "SelfEmployed" "FamilyWork" "Unemployment"
We will remove columns that do not pertain to predicting median household income and create two dataframes (one for Income and one for IncomePerCap).
# Build two working dataframes by dropping the columns that do not pertain to
# each response: one targets Income, the other IncomePerCap.
# Logical-name indexing is used instead of -which(names %in% ...): when no name
# matches, -which() yields -integer(0) and silently selects ZERO columns,
# whereas the logical form is a safe no-op.
drop_for_income <- c("Borough", "IncomeErr", "IncomePerCap", "IncomePerCapErr")
census_df_income <- census_df_full[, !(names(census_df_full) %in% drop_for_income)]
drop_for_ipc <- c("Borough", "Income", "IncomeErr", "IncomePerCapErr")
census_df_ipc <- census_df_full[, !(names(census_df_full) %in% drop_for_ipc)]
# Column-wise five-number summaries; note the NA counts in the output, which
# motivate the na.omit() step further down.
summary(census_df_income)
## CensusTract County TotalPop Men
## Min. :3.601e+10 Bronx :339 Min. : 0 Min. : 0
## 1st Qu.:3.605e+10 Kings :761 1st Qu.: 2360 1st Qu.: 1113
## Median :3.605e+10 New York:288 Median : 3550 Median : 1699
## Mean :3.605e+10 Queens :669 Mean : 3889 Mean : 1853
## 3rd Qu.:3.608e+10 Richmond:110 3rd Qu.: 4958 3rd Qu.: 2360
## Max. :3.609e+10 Max. :28926 Max. :13460
##
## Women Hispanic White Black
## Min. : 0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1224 1st Qu.: 9.00 1st Qu.: 4.10 1st Qu.: 1.50
## Median : 1848 Median : 18.40 Median : 22.95 Median : 7.80
## Mean : 2036 Mean : 26.62 Mean : 33.06 Mean : 23.95
## 3rd Qu.: 2572 3rd Qu.: 39.90 3rd Qu.: 60.10 3rd Qu.: 39.08
## Max. :15466 Max. :100.00 Max. :100.00 Max. :100.00
## NA's :39 NA's :39 NA's :39
## Native Asian Citizen Income
## Min. : 0.0000 Min. : 0.00 Min. : 0 Min. : 9829
## 1st Qu.: 0.0000 1st Qu.: 2.10 1st Qu.: 1446 1st Qu.: 39073
## Median : 0.0000 Median : 6.70 Median : 2140 Median : 54505
## Mean : 0.1986 Mean :13.44 Mean : 2436 Mean : 59101
## 3rd Qu.: 0.0000 3rd Qu.:18.93 3rd Qu.: 2976 3rd Qu.: 73272
## Max. :11.3000 Max. :89.80 Max. :22905 Max. :244375
## NA's :39 NA's :39 NA's :66
## Poverty ChildPoverty Professional Service
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.20 1st Qu.: 8.30 1st Qu.: 23.00 1st Qu.:15.70
## Median : 16.70 Median :21.10 Median : 33.60 Median :24.55
## Mean : 19.57 Mean :24.48 Mean : 36.55 Mean :24.23
## 3rd Qu.: 26.70 3rd Qu.:37.55 3rd Qu.: 45.80 3rd Qu.:32.12
## Max. :100.00 Max. :95.70 Max. :100.00 Max. :59.80
## NA's :42 NA's :60 NA's :43 NA's :43
## Office Construction Production Drive
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 19.30 1st Qu.: 3.300 1st Qu.: 5.500 1st Qu.: 10.90
## Median : 23.10 Median : 6.100 Median : 8.950 Median : 20.75
## Mean : 23.43 Mean : 6.634 Mean : 9.157 Mean : 24.88
## 3rd Qu.: 26.80 3rd Qu.: 9.200 3rd Qu.: 12.600 3rd Qu.: 36.33
## Max. :100.00 Max. :43.200 Max. :100.000 Max. :100.00
## NA's :43 NA's :43 NA's :43 NA's :43
## Carpool Transit Walk OtherTransp
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 43.30 1st Qu.: 3.300 1st Qu.: 0.500
## Median : 4.200 Median : 57.40 Median : 6.500 Median : 1.500
## Mean : 5.088 Mean : 54.93 Mean : 9.048 Mean : 2.307
## 3rd Qu.: 7.200 3rd Qu.: 67.90 3rd Qu.: 10.800 3rd Qu.: 3.200
## Max. :26.400 Max. :100.00 Max. :100.000 Max. :55.600
## NA's :43 NA's :43 NA's :43 NA's :43
## WorkAtHome MeanCommute Employed PrivateWork
## Min. : 0.000 Min. :15.20 Min. : 0 Min. : 38.60
## 1st Qu.: 1.300 1st Qu.:37.20 1st Qu.: 1052 1st Qu.: 75.00
## Median : 2.800 Median :41.40 Median : 1579 Median : 79.90
## Mean : 3.749 Mean :40.83 Mean : 1813 Mean : 79.54
## 3rd Qu.: 5.100 3rd Qu.:45.38 3rd Qu.: 2274 3rd Qu.: 84.40
## Max. :100.000 Max. :70.50 Max. :12780 Max. :100.00
## NA's :43 NA's :61 NA's :43
## PublicWork SelfEmployed FamilyWork Unemployment
## Min. : 0.00 Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 8.80 1st Qu.: 3.500 1st Qu.:0.0000 1st Qu.: 5.900
## Median :13.30 Median : 5.600 Median :0.0000 Median : 8.800
## Mean :14.19 Mean : 6.151 Mean :0.1186 Mean : 9.758
## 3rd Qu.:18.70 3rd Qu.: 8.200 3rd Qu.:0.0000 3rd Qu.: 12.600
## Max. :53.70 Max. :61.400 Max. :3.9000 Max. :100.000
## NA's :43 NA's :43 NA's :43 NA's :42
As we can see in the above summary, our data still contains missing values. We will remove the rows where values are missing.
# Listwise deletion: drop every row containing at least one NA.
# (na.omit() also records the dropped row indices in the "na.action" attribute.)
census_df_naomit_income <- na.omit(census_df_income)
census_df_naomit_ipc <- na.omit(census_df_ipc)
dim(census_df_naomit_income)
## [1] 2095 32
dim(census_df_naomit_ipc)
## [1] 2100 32
We will also remove the column ‘County’ because, while it might be predictive, it will not contribute any interesting explanatory value. Additionally, we will remove ‘White’, ‘Men’, ‘PrivateWork’, ‘Professional’, and ‘Transit’ because they are redundant: each belongs to a group of variables whose values sum to 100%, so other variables in the data correlate with them.
# Drop County (predictive but not explanatory) plus White, Men, PrivateWork,
# Professional and Transit, which are redundant: each belongs to a group of
# percentage variables summing to 100%, so the remaining group members carry
# the same information.
# Logical indexing avoids the -which() zero-match footgun (see earlier note).
drop_redundant <- c("County", "White", "Men", "PrivateWork", "Professional", "Transit")
census_df_rd2_income <- census_df_naomit_income[, !(names(census_df_naomit_income) %in% drop_redundant)]
census_df_rd2_ipc <- census_df_naomit_ipc[, !(names(census_df_naomit_ipc) %in% drop_redundant)]
dim(census_df_rd2_income)
## [1] 2095 26
dim(census_df_rd2_ipc)
## [1] 2100 26
First, let’s look at the correlations between our variables. (CensusTract is not considered, for the same reason as Borough.)
# Pairwise correlations among candidate predictors (column 1, CensusTract,
# is excluded for the same reason as Borough).
par(mfrow = c(1, 1))
cor_income <- cor(census_df_rd2_income[, -1], use = "pairwise.complete.obs")
corrplot(cor_income, method = "circle", title = "Median Income", mar = c(0, 0, 1, 0))
cor_ipc <- cor(census_df_rd2_ipc[, -1], use = "pairwise.complete.obs")
par(mfrow = c(1, 1))
corrplot(cor_ipc, method = "circle", title = "Income Per Capita", mar = c(0, 0, 1, 0))
TotalPop is highly correlated with Citizen (the number of citizens). We will remove it as it is redundant, and show the remaining 24 variables for each dataframe.
# TotalPop is highly correlated with Citizen, so drop it; then drop the
# CensusTract identifier to form the final predictor dataframe for Income.
# Logical indexing is used instead of -which(...), which would silently select
# zero columns if the name were ever absent.
census_df_rd2_income <- census_df_rd2_income[, names(census_df_rd2_income) != "TotalPop"]
census_df_pred_income <- census_df_rd2_income[, names(census_df_rd2_income) != "CensusTract"]
dim(census_df_pred_income)
## [1] 2095 24
names(census_df_pred_income)
## [1] "Women" "Hispanic" "Black" "Native"
## [5] "Asian" "Citizen" "Income" "Poverty"
## [9] "ChildPoverty" "Service" "Office" "Construction"
## [13] "Production" "Drive" "Carpool" "Walk"
## [17] "OtherTransp" "WorkAtHome" "MeanCommute" "Employed"
## [21] "PublicWork" "SelfEmployed" "FamilyWork" "Unemployment"
# Same cleanup for the IncomePerCap dataframe: drop TotalPop (correlated with
# Citizen) and the CensusTract identifier, using safe logical indexing.
census_df_rd2_ipc <- census_df_rd2_ipc[, names(census_df_rd2_ipc) != "TotalPop"]
census_df_pred_ipc <- census_df_rd2_ipc[, names(census_df_rd2_ipc) != "CensusTract"]
dim(census_df_pred_ipc)
## [1] 2100 24
names(census_df_pred_ipc)
## [1] "Women" "Hispanic" "Black" "Native"
## [5] "Asian" "Citizen" "IncomePerCap" "Poverty"
## [9] "ChildPoverty" "Service" "Office" "Construction"
## [13] "Production" "Drive" "Carpool" "Walk"
## [17] "OtherTransp" "WorkAtHome" "MeanCommute" "Employed"
## [21] "PublicWork" "SelfEmployed" "FamilyWork" "Unemployment"
Now we will use Stepwise regressions to determine which features are the most predictive for each model. To determine how many features to include in each model, we will look at the number of features and some model evaluation metrics: RSS, Cp, Adjusted R^2, and BIC.
Income
set.seed(1) # reproducibility
# Exhaustive best-subset search for Income over all predictors (up to 23).
regfit.full.income <- regsubsets(Income ~ ., data = census_df_pred_income, nvmax = 23)
reg.summary.income <- summary(regfit.full.income)
# Plot each selection criterion against subset size in a 2x2 grid.
par(mfrow = c(2, 2))
metrics_income <- list("RSS" = reg.summary.income$rss,
                       "Cp" = reg.summary.income$cp,
                       "Adjusted RSq" = reg.summary.income$adjr2,
                       "BIC" = reg.summary.income$bic)
for (metric_name in names(metrics_income)) {
  plot(metrics_income[[metric_name]], xlab = "Number of Variables",
       ylab = metric_name, type = "l")
}
# Subset size at which BIC is minimised.
which.min(reg.summary.income$bic)
## [1] 14
As we can see, after 14 variables have been implemented for our income model, RSS and Cp does not decrease, additionally, the Adjusted R^2 values do not increase much after 14 variables are implemented, and BIC finds its minimum (optimal) value at 14 variables.
IncomePerCap
set.seed(2) # reproducibility
# Exhaustive best-subset search for IncomePerCap over all predictors (up to 23).
regfit.full.ipc <- regsubsets(IncomePerCap ~ ., data = census_df_pred_ipc, nvmax = 23)
reg.summary.ipc <- summary(regfit.full.ipc)
# Plot each selection criterion against subset size in a 2x2 grid.
par(mfrow = c(2, 2))
metrics_ipc <- list("RSS" = reg.summary.ipc$rss,
                    "Cp" = reg.summary.ipc$cp,
                    "Adjusted RSq" = reg.summary.ipc$adjr2,
                    "BIC" = reg.summary.ipc$bic)
for (metric_name in names(metrics_ipc)) {
  plot(metrics_ipc[[metric_name]], xlab = "Number of Variables",
       ylab = metric_name, type = "l")
}
# Subset size at which BIC is minimised.
which.min(reg.summary.ipc$bic)
## [1] 13
By examining the plots, we can see that after about 13 features are implemented in our IncomePerCap model, the RSS and Cp do not decrease much further. Additionally, the Adjusted R^2 does not increase much beyond 13 features and the BIC finds its minimum at 13 variables implemented.
To gather further evidence for the most predictive features, we will use cross-validation to find which subsets give the smallest MSE on a test set.
Income
set.seed(1000)
# Predict method for regsubsets objects (leaps does not provide one):
# rebuilds the model matrix from the formula stored in the original call and
# multiplies it by the coefficients of the id-th best subset.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
# Named constants instead of repeated magic numbers 10 and 23.
k <- 10        # number of CV folds
max_vars <- 23 # largest subset size considered
folds_income <- sample(1:k, nrow(census_df_pred_income), replace = TRUE)
cv.mse.income <- matrix(NA, nrow = k, ncol = max_vars)
for (i in seq_len(k)) {
  # Fit best subsets on everything except fold i...
  best.sub <- regsubsets(Income ~ ., data = census_df_pred_income[folds_income != i, ], nvmax = max_vars)
  for (j in seq_len(max_vars)) {
    # ...and score each subset size on the held-out fold.
    pred.s <- predict(best.sub, census_df_pred_income[folds_income == i, ], id = j)
    cv.mse.income[i, j] <- mean((census_df_pred_income$Income[folds_income == i] - pred.s)^2)
  }
}
avg.cv.mse.income <- apply(cv.mse.income, 2, mean)
plot(avg.cv.mse.income, type = "b", main = "MSE- 10 fold MSE, Best Subsets")
v <- which.min(avg.cv.mse.income) # best subset size; reuse below
abline(v = v, col = "blue")
As we can see above, the minimum MSE is found at 14 variables — the same optimal number of variables found in the BIC, RSS, Cp, and Adjusted R^2 plots.
IncomePerCap
set.seed(2000)
# predict.regsubsets() is already defined in the Income cross-validation above;
# the original redefined the identical helper here, which was redundant.
k <- 10        # number of CV folds
max_vars <- 23 # largest subset size considered
folds_ipc <- sample(1:k, nrow(census_df_pred_ipc), replace = TRUE)
cv.mse.ipc <- matrix(NA, nrow = k, ncol = max_vars)
for (i in seq_len(k)) {
  # Fit best subsets on everything except fold i...
  best.sub <- regsubsets(IncomePerCap ~ ., data = census_df_pred_ipc[folds_ipc != i, ], nvmax = max_vars)
  for (j in seq_len(max_vars)) {
    # ...and score each subset size on the held-out fold.
    pred.s <- predict(best.sub, census_df_pred_ipc[folds_ipc == i, ], id = j)
    cv.mse.ipc[i, j] <- mean((census_df_pred_ipc$IncomePerCap[folds_ipc == i] - pred.s)^2)
  }
}
avg.cv.mse.ipc <- apply(cv.mse.ipc, 2, mean)
plot(avg.cv.mse.ipc, type = "b", main = "MSE- 10 fold MSE, Best Subsets")
v <- which.min(avg.cv.mse.ipc) # best subset size; reuse below
abline(v = v, col = "blue")
IncomePerCap finds its minimum at 20 variables. Although this is larger than the 13 variables we saw earlier, the plot shows that the decreases in MSE past 13 variables are negligible. Therefore, we will proceed with our IncomePerCap model using the 13 most predictive variables.
We can now examine the most predictive variables and their coefficients.
For our Income Model
# Refit best subsets on the full Income data and report the coefficients of
# the 14-variable model selected above.
regfit.best.income <- regsubsets(Income ~ ., data = census_df_pred_income, nvmax = 23)
coef(regfit.best.income, id = 14)
## (Intercept) Hispanic Black Citizen Poverty
## 108185.896835 67.552656 62.630255 -3.973985 -986.731531
## Service Office Construction Production Drive
## -746.465538 -737.809956 -622.012848 -757.316100 365.923942
## Walk OtherTransp WorkAtHome Employed PublicWork
## 310.108552 1223.835573 989.439797 4.693423 -256.564918
For our IncomePerCap Model
# Refit best subsets on the full IncomePerCap data and report the coefficients
# of the 13-variable model selected above.
regfit.best.ipc <- regsubsets(IncomePerCap ~ ., data = census_df_pred_ipc, nvmax = 23)
coef(regfit.best.ipc, id = 13)
## (Intercept) Asian Poverty Service Office
## 84514.0799342 -90.1087828 -615.9488241 -519.3038778 -544.3457615
## Construction Production Carpool Walk OtherTransp
## -727.3798415 -719.3725050 -235.4293857 401.2525869 1630.6422407
## WorkAtHome MeanCommute Employed PublicWork
## 536.7431216 -194.7644181 0.9142642 -295.6461710
We will start by creating a base model - that is a model with all variables included.
Income
# Baseline Income model: ordinary least squares on all 23 predictors.
model.base.income <- lm(Income ~ ., data = census_df_pred_income)
summary(model.base.income)
##
## Call:
## lm(formula = Income ~ ., data = census_df_pred_income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44174 -8750 -926 7472 113712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.116e+05 4.170e+03 26.774 < 2e-16 ***
## Women -8.714e-01 1.078e+00 -0.808 0.419017
## Hispanic 5.071e+01 2.457e+01 2.064 0.039147 *
## Black 5.114e+01 1.902e+01 2.688 0.007246 **
## Native 5.988e+01 4.715e+02 0.127 0.898961
## Asian -1.933e+01 2.716e+01 -0.712 0.476609
## Citizen -3.813e+00 8.950e-01 -4.261 2.13e-05 ***
## Poverty -1.048e+03 7.493e+01 -13.982 < 2e-16 ***
## ChildPoverty 4.039e+01 4.299e+01 0.939 0.347596
## Service -7.302e+02 5.385e+01 -13.559 < 2e-16 ***
## Office -7.383e+02 6.147e+01 -12.010 < 2e-16 ***
## Construction -6.147e+02 8.958e+01 -6.862 8.93e-12 ***
## Production -7.122e+02 8.738e+01 -8.151 6.20e-16 ***
## Drive 3.637e+02 3.325e+01 10.936 < 2e-16 ***
## Carpool -4.040e+01 9.677e+01 -0.417 0.676375
## Walk 2.915e+02 5.319e+01 5.480 4.76e-08 ***
## OtherTransp 1.187e+03 1.511e+02 7.850 6.62e-15 ***
## WorkAtHome 9.876e+02 1.314e+02 7.515 8.40e-14 ***
## MeanCommute -6.611e+01 7.324e+01 -0.903 0.366770
## Employed 5.383e+00 1.001e+00 5.379 8.36e-08 ***
## PublicWork -2.713e+02 7.021e+01 -3.864 0.000115 ***
## SelfEmployed -1.386e+02 1.050e+02 -1.321 0.186762
## FamilyWork 1.495e+03 8.089e+02 1.849 0.064652 .
## Unemployment 1.138e+02 8.304e+01 1.370 0.170829
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14370 on 2071 degrees of freedom
## Multiple R-squared: 0.754, Adjusted R-squared: 0.7512
## F-statistic: 275.9 on 23 and 2071 DF, p-value: < 2.2e-16
IncomePerCap
# Baseline IncomePerCap model: ordinary least squares on all 23 predictors.
model.base.ipc <- lm(IncomePerCap ~ ., data = census_df_pred_ipc)
summary(model.base.ipc)
##
## Call:
## lm(formula = IncomePerCap ~ ., data = census_df_pred_ipc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48738 -6036 -415 5110 145925
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.449e+04 3.834e+03 22.036 < 2e-16 ***
## Women -1.585e+00 9.921e-01 -1.598 0.110193
## Hispanic -1.104e+01 2.266e+01 -0.487 0.626307
## Black 5.052e+00 1.752e+01 0.288 0.773111
## Native -1.211e+02 4.166e+02 -0.291 0.771239
## Asian -8.404e+01 2.502e+01 -3.359 0.000797 ***
## Citizen -5.388e-02 8.235e-01 -0.065 0.947837
## Poverty -6.580e+02 6.819e+01 -9.649 < 2e-16 ***
## ChildPoverty 4.918e+01 3.889e+01 1.265 0.206192
## Service -5.341e+02 4.963e+01 -10.763 < 2e-16 ***
## Office -5.695e+02 5.667e+01 -10.051 < 2e-16 ***
## Construction -7.221e+02 8.005e+01 -9.021 < 2e-16 ***
## Production -7.013e+02 8.019e+01 -8.746 < 2e-16 ***
## Drive 3.056e+01 3.064e+01 0.997 0.318728
## Carpool -2.341e+02 8.908e+01 -2.628 0.008656 **
## Walk 4.153e+02 4.889e+01 8.495 < 2e-16 ***
## OtherTransp 1.645e+03 1.368e+02 12.024 < 2e-16 ***
## WorkAtHome 6.055e+02 1.169e+02 5.178 2.45e-07 ***
## MeanCommute -2.141e+02 6.733e+01 -3.180 0.001494 **
## Employed 2.661e+00 9.216e-01 2.887 0.003928 **
## PublicWork -3.371e+02 6.461e+01 -5.217 2.00e-07 ***
## SelfEmployed -1.917e+02 9.599e+01 -1.997 0.045940 *
## FamilyWork 1.486e+03 7.456e+02 1.992 0.046460 *
## Unemployment 1.949e+02 7.643e+01 2.550 0.010844 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13260 on 2076 degrees of freedom
## Multiple R-squared: 0.7286, Adjusted R-squared: 0.7256
## F-statistic: 242.3 on 23 and 2076 DF, p-value: < 2.2e-16
Next, we will create an Income model with the features we found most predictive: Hispanic, Black, Citizen, Poverty, Service, Office, Construction, Production, Drive, Walk, OtherTransp, WorkAtHome, Employed, and PublicWork.
# Reduced Income model: the 14 predictors chosen by best-subset selection.
model.income.1 <- lm(
  Income ~ Hispanic + Black + Citizen + Poverty + Service + Office +
    Construction + Production + Drive + Walk + OtherTransp + WorkAtHome +
    Employed + PublicWork,
  data = census_df_pred_income
)
summary(model.income.1)
##
## Call:
## lm(formula = Income ~ Hispanic + Black + Citizen + Poverty +
## Service + Office + Construction + Production + Drive + Walk +
## OtherTransp + WorkAtHome + Employed + PublicWork, data = census_df_pred_income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44504 -8797 -1025 7345 116360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.082e+05 2.701e+03 40.059 < 2e-16 ***
## Hispanic 6.755e+01 2.103e+01 3.212 0.001339 **
## Black 6.263e+01 1.540e+01 4.067 4.94e-05 ***
## Citizen -3.974e+00 6.533e-01 -6.083 1.40e-09 ***
## Poverty -9.867e+02 4.155e+01 -23.750 < 2e-16 ***
## Service -7.465e+02 4.968e+01 -15.025 < 2e-16 ***
## Office -7.378e+02 5.978e+01 -12.342 < 2e-16 ***
## Construction -6.220e+02 8.804e+01 -7.065 2.18e-12 ***
## Production -7.573e+02 8.346e+01 -9.073 < 2e-16 ***
## Drive 3.659e+02 3.155e+01 11.599 < 2e-16 ***
## Walk 3.101e+02 4.597e+01 6.746 1.96e-11 ***
## OtherTransp 1.224e+03 1.485e+02 8.242 2.95e-16 ***
## WorkAtHome 9.894e+02 1.256e+02 7.879 5.26e-15 ***
## Employed 4.693e+00 8.786e-01 5.342 1.02e-07 ***
## PublicWork -2.566e+02 6.870e+01 -3.735 0.000193 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14370 on 2080 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7511
## F-statistic: 452.4 on 14 and 2080 DF, p-value: < 2.2e-16
Next, we will create an IncomePerCap model with the features we found most predictive: Asian, Poverty, Service, Office, Construction, Production, Carpool, Walk, OtherTransp, WorkAtHome, MeanCommute, Employed, and PublicWork.
# Reduced IncomePerCap model: the 13 predictors chosen by best-subset selection.
model.ipc.1 <- lm(
  IncomePerCap ~ Asian + Poverty + Service + Office + Construction +
    Production + Carpool + Walk + OtherTransp + WorkAtHome + MeanCommute +
    Employed + PublicWork,
  data = census_df_pred_ipc
)
summary(model.ipc.1)
##
## Call:
## lm(formula = IncomePerCap ~ Asian + Poverty + Service + Office +
## Construction + Production + Carpool + Walk + OtherTransp +
## WorkAtHome + MeanCommute + Employed + PublicWork, data = census_df_pred_ipc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53757 -5985 -322 5061 146391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84514.0799 3335.2807 25.339 < 2e-16 ***
## Asian -90.1088 20.6482 -4.364 1.34e-05 ***
## Poverty -615.9488 33.8138 -18.216 < 2e-16 ***
## Service -519.3039 41.4322 -12.534 < 2e-16 ***
## Office -544.3458 53.7503 -10.127 < 2e-16 ***
## Construction -727.3798 75.5406 -9.629 < 2e-16 ***
## Production -719.3725 75.5659 -9.520 < 2e-16 ***
## Carpool -235.4294 83.2682 -2.827 0.004738 **
## Walk 401.2526 46.0729 8.709 < 2e-16 ***
## OtherTransp 1630.6422 132.7297 12.285 < 2e-16 ***
## WorkAtHome 536.7431 112.9506 4.752 2.15e-06 ***
## MeanCommute -194.7644 64.3192 -3.028 0.002491 **
## Employed 0.9143 0.2715 3.368 0.000772 ***
## PublicWork -295.6462 54.0760 -5.467 5.12e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13290 on 2086 degrees of freedom
## Multiple R-squared: 0.7259, Adjusted R-squared: 0.7242
## F-statistic: 425 on 13 and 2086 DF, p-value: < 2.2e-16
Both of these optimized models have about the same Adjusted R^2 value as their base - 0.75 and 0.72 respectively. However, many of the variables in the base model have large p-values while all of the variables in model.1 have p-values << 0.05.
We can also evaluate these models using cross-validation to see how each model performs on a test set. We will use MSE to evaluate their errors.
set.seed(1)
k <- 10 # number of CV folds
# Assign each row to one of k folds, then shuffle the assignments.
# (breaks = k replaces the hard-coded 10, keeping it consistent with k.)
folds <- cut(seq(1, nrow(census_df_pred_income)), breaks = k, labels = FALSE)
folds <- folds[sample(length(folds))]
cv.errors.income <- matrix(NA, k, 2) # col 1: full model, col 2: 14-variable model
for (j in seq_len(k)) {
  # Fit both models on the training folds...
  lm.fit.base <- lm(Income ~ ., data = census_df_pred_income[folds != j, ])
  lm.fit.1 <- lm(Income ~ Hispanic + Black + Citizen + Poverty + Service + Office + Construction + Production + Drive + Walk + OtherTransp + WorkAtHome + Employed + PublicWork,
                 data = census_df_pred_income[folds != j, ])
  # ...and compute their test MSE on the held-out fold j.
  pred.base <- predict(lm.fit.base, census_df_pred_income[folds == j, ])
  pred.1 <- predict(lm.fit.1, census_df_pred_income[folds == j, ])
  cv.errors.income[j, 1] <- mean((census_df_pred_income$Income[folds == j] - pred.base)^2)
  cv.errors.income[j, 2] <- mean((census_df_pred_income$Income[folds == j] - pred.1)^2)
}
print(cv.errors.income)
## [,1] [,2]
## [1,] 247879680 246836976
## [2,] 221620154 223088617
## [3,] 160014769 158283690
## [4,] 258397058 255427178
## [5,] 198196387 198809931
## [6,] 304679816 298684128
## [7,] 186919020 186499589
## [8,] 155710151 150963255
## [9,] 192408682 188101206
## [10,] 190297725 187816911
mean.cv.errors.income <- apply(cv.errors.income, 2, mean)
print(mean.cv.errors.income)
## [1] 211612344 209451148
As we can see, the Income model with the optimal number of variables has a lower test MSE than the full model. This makes sense, as the reduced model does not overfit the training data as much.
set.seed(1)
k <- 10 # number of CV folds
folds <- cut(seq(1, nrow(census_df_pred_ipc)), breaks = k, labels = FALSE)
folds <- folds[sample(length(folds))]
cv.errors.ipc <- matrix(NA, k, 2) # col 1: full model, col 2: 13-variable model
for (j in seq_len(k)) {
  lm.fit.base <- lm(IncomePerCap ~ ., data = census_df_pred_ipc[folds != j, ])
  lm.fit.2 <- lm(IncomePerCap ~ Asian + Poverty + Service + Office + Construction + Production + Carpool + Walk + OtherTransp + WorkAtHome + MeanCommute + Employed + PublicWork,
                 data = census_df_pred_ipc[folds != j, ])
  pred.base <- predict(lm.fit.base, census_df_pred_ipc[folds == j, ])
  pred.2 <- predict(lm.fit.2, census_df_pred_ipc[folds == j, ])
  # BUG FIX: the original compared against census_df_pred_ipc$Income, a column
  # that does not exist in this dataframe -- it only worked because `$`
  # partial-matches "Income" to "IncomePerCap". Reference the column explicitly.
  cv.errors.ipc[j, 1] <- mean((census_df_pred_ipc$IncomePerCap[folds == j] - pred.base)^2)
  cv.errors.ipc[j, 2] <- mean((census_df_pred_ipc$IncomePerCap[folds == j] - pred.2)^2)
}
print(cv.errors.ipc)
## [,1] [,2]
## [1,] 146809657 147028309
## [2,] 131829521 129256616
## [3,] 173366261 174743128
## [4,] 185522940 188463304
## [5,] 178770820 181541819
## [6,] 160169039 161235644
## [7,] 238083423 234009814
## [8,] 117567234 116031224
## [9,] 217426196 212450407
## [10,] 265348193 260025982
mean.cv.errors.ipc <- apply(cv.errors.ipc, 2, mean)
print(mean.cv.errors.ipc)
## [1] 181489329 180478625
Again, as expected, the optimized model for IncomePerCap has a smaller MSE value because it did not overfit the training set as much.
Perhaps we can improve upon this model. Looking at the plots of each variable vs. Income/IncomePerCap, it seems some of the variables have skewed distributions.
Income
# Scatterplot of each selected predictor against Income, each with a fitted
# least-squares line overlaid; all 14 panels combined with grid.arrange().
# (Kept as explicit per-variable calls: aes() uses non-standard evaluation,
# so a loop-based rewrite could alter the default axis labels.)
p1_income <- ggplot(census_df_pred_income, aes(Hispanic,Income)) + geom_point() + geom_smooth(method ="lm")
p2_income <- ggplot(census_df_pred_income, aes(Black,Income)) + geom_point() + geom_smooth(method ="lm")
p3_income <- ggplot(census_df_pred_income, aes(Citizen,Income)) + geom_point() + geom_smooth(method ="lm")
p4_income <- ggplot(census_df_pred_income, aes(Poverty,Income)) + geom_point() + geom_smooth(method ="lm")
p5_income <- ggplot(census_df_pred_income, aes(Service,Income)) + geom_point() + geom_smooth(method ="lm")
p6_income <- ggplot(census_df_pred_income, aes(Office,Income)) + geom_point() + geom_smooth(method ="lm")
p7_income <- ggplot(census_df_pred_income, aes(Construction,Income)) + geom_point() + geom_smooth(method ="lm")
p8_income <- ggplot(census_df_pred_income, aes(Production,Income)) + geom_point() + geom_smooth(method ="lm")
p9_income <- ggplot(census_df_pred_income, aes(Drive,Income)) + geom_point() + geom_smooth(method ="lm")
p10_income <- ggplot(census_df_pred_income, aes(Walk,Income)) + geom_point() + geom_smooth(method ="lm")
p11_income <- ggplot(census_df_pred_income, aes(OtherTransp,Income)) + geom_point() + geom_smooth(method ="lm")
p12_income <- ggplot(census_df_pred_income, aes(WorkAtHome,Income)) + geom_point() + geom_smooth(method ="lm")
p13_income <- ggplot(census_df_pred_income, aes(Employed,Income)) + geom_point() + geom_smooth(method ="lm")
p14_income <- ggplot(census_df_pred_income, aes(PublicWork,Income)) + geom_point() + geom_smooth(method ="lm")
grid.arrange(p1_income,p2_income,p3_income,p4_income,p5_income,p6_income,p7_income,p8_income,p9_income,p10_income,p11_income,p12_income,p13_income,p14_income)
IncomePerCap
# Scatterplot of each selected predictor against IncomePerCap, each with a
# fitted least-squares line overlaid; all 13 panels combined with grid.arrange().
# (Kept as explicit per-variable calls: aes() uses non-standard evaluation,
# so a loop-based rewrite could alter the default axis labels.)
p1_spc <- ggplot(census_df_pred_ipc, aes(Asian,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p2_spc <- ggplot(census_df_pred_ipc, aes(Poverty,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p3_spc <- ggplot(census_df_pred_ipc, aes(Service,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p4_spc <- ggplot(census_df_pred_ipc, aes(Office,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p5_spc <- ggplot(census_df_pred_ipc, aes(Construction,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p6_spc <- ggplot(census_df_pred_ipc, aes(Production,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p7_spc <- ggplot(census_df_pred_ipc, aes(Carpool,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p8_spc <- ggplot(census_df_pred_ipc, aes(Walk,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p9_spc <- ggplot(census_df_pred_ipc, aes(OtherTransp,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p10_spc <- ggplot(census_df_pred_ipc, aes(WorkAtHome,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p11_spc <- ggplot(census_df_pred_ipc, aes(MeanCommute,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p12_spc <- ggplot(census_df_pred_ipc, aes(Employed,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
p13_spc <- ggplot(census_df_pred_ipc, aes(PublicWork,IncomePerCap)) + geom_point() + geom_smooth(method ="lm")
grid.arrange(p1_spc,p2_spc,p3_spc,p4_spc,p5_spc,p6_spc,p7_spc,p8_spc,p9_spc,p10_spc,p11_spc,p12_spc,p13_spc)
We can also look at the kernel density for each variable to get a better idea of the data’s distribution.
Income
# Kernel density estimate for each predictor retained in the Income model,
# one plot per variable (same order and labels as before).
for (v in c("Hispanic", "Black", "Citizen", "Poverty", "Service", "Office",
            "Construction", "Production", "Drive", "Walk", "OtherTransp",
            "WorkAtHome", "Employed", "PublicWork")) {
  plot(density(census_df_pred_income[[v]]), xlab = v, ylab = 'Density',
       main = 'Kernel Density Plot')
}
IncomePerCap
# Kernel density estimate for each predictor retained in the IncomePerCap
# model, one plot per variable (same order and labels as before).
for (v in c("Asian", "Poverty", "Service", "Office", "Construction",
            "Production", "Carpool", "Walk", "OtherTransp", "WorkAtHome",
            "MeanCommute", "Employed", "PublicWork")) {
  plot(density(census_df_pred_ipc[[v]]), xlab = v, ylab = 'Density',
       main = 'Kernel Density Plot')
}
After reviewing the distributions of the variables, it was clear that some appear to be exponentially distributed. Therefore, we will log-transform these variables to make their relationships more linear. Plots of these variables vs. Income/IncomePerCap are shown below, both before and after log-transformation.
Note: when we log-transform, we add a constant to the data to avoid taking the log of 0.
Income
constant <- 1 # offset so log10(0 + constant) is defined
# Create a log10-transformed copy of each skewed predictor (logPoverty,
# logPublicWork, ...), then plot each variable against Income before and
# after the transformation, side by side.
log_vars_income <- c("Poverty", "PublicWork", "Service", "Construction",
                     "Production", "Walk")
for (v in log_vars_income) {
  census_df_pred_income[[paste0("log", v)]] <-
    log10(census_df_pred_income[[v]] + constant)
}
par(mfrow = c(1, 2))
for (v in log_vars_income) {
  plot(census_df_pred_income[[v]], census_df_pred_income$Income,
       xlab = v, ylab = 'Income')
  plot(census_df_pred_income[[paste0("log", v)]], census_df_pred_income$Income,
       xlab = paste0("log", v), ylab = 'Income')
}
IncomePerCap
constant <- 1 # offset so log10(0 + constant) is defined
# Create a log10-transformed copy of each skewed predictor, then plot each
# variable against IncomePerCap before and after the transformation.
# FIXES to the axis labels: "Income Per Capital" -> "Income Per Capita"
# (typo), and the mixed 'Log(...)'/'log(...)' x-labels are normalised to a
# consistent log(...) form.
log_vars_ipc <- c("Poverty", "Service", "Construction", "Production",
                  "MeanCommute", "PublicWork")
for (v in log_vars_ipc) {
  census_df_pred_ipc[[paste0("log", v)]] <-
    log10(census_df_pred_ipc[[v]] + constant)
}
par(mfrow = c(1, 2))
for (v in log_vars_ipc) {
  plot(census_df_pred_ipc[[v]], census_df_pred_ipc$IncomePerCap,
       xlab = v, ylab = 'Income Per Capita')
  plot(census_df_pred_ipc[[paste0("log", v)]], census_df_pred_ipc$IncomePerCap,
       xlab = paste0("log(", v, ")"), ylab = 'Income Per Capita')
}
Now we can compare our previously optimized model with 14 variables: Hispanic, Black, Citizen, Poverty, Service, Office, Construction, Production, Drive, Walk, OtherTransp, WorkAtHome, Employed, and PublicWork, to the optimized model with logPoverty, logPublicWork, logService, logConstruction, logProduction, and logWalk as log-transformed variables. We will evaluate the models using the MSE of their test set.
## 10-fold cross-validation comparing the raw-variable Income model (column 1)
## against the log-transformed Income model (column 2) by held-out MSE.
set.seed(3)
k <- 10
## Assign each row to one of k folds, then shuffle the assignments.
folds <- sample(cut(seq_len(nrow(census_df_pred_income)), breaks = k, labels = FALSE))
cv.errors.income <- matrix(NA, k, 2)
for (j in seq_len(k)) {
  train <- census_df_pred_income[folds != j, ]
  holdout <- census_df_pred_income[folds == j, ]
  lm.fit.income.1 <- lm(Income ~ Hispanic + Black + Citizen + Poverty + Service +
                          Office + Construction + Production + Drive + Walk +
                          OtherTransp + WorkAtHome + Employed + PublicWork,
                        data = train)
  lm.fit.income.2 <- lm(Income ~ Hispanic + Black + Citizen + logPoverty +
                          logService + Office + logConstruction + logProduction +
                          Drive + logWalk + OtherTransp + WorkAtHome + Employed +
                          logPublicWork,
                        data = train)
  ## Held-out MSE for each model on fold j.
  cv.errors.income[j, 1] <- mean((holdout$Income - predict(lm.fit.income.1, holdout))^2)
  cv.errors.income[j, 2] <- mean((holdout$Income - predict(lm.fit.income.2, holdout))^2)
}
print(cv.errors.income)
## [,1] [,2]
## [1,] 165803463 141292976
## [2,] 370150361 286346429
## [3,] 186249509 149243263
## [4,] 221779556 162935341
## [5,] 147475885 106894802
## [6,] 239646057 180608307
## [7,] 218514324 176781391
## [8,] 179127636 135046604
## [9,] 160662798 126969856
## [10,] 221055001 189606958
mean.cv.errors.income <- apply(cv.errors.income, 2, mean)
print(mean.cv.errors.income)
## [1] 211046459 165572593
Clearly, the MSE of the model with log-transformed variables is much smaller than the MSE of the model with just our 14 variables.
## 10-fold cross-validation comparing the raw-variable IncomePerCap model
## (column 1) against the log-transformed model (column 2) by held-out MSE.
set.seed(4)
k <- 10
## Assign each row to one of k folds, then shuffle the assignments.
folds <- sample(cut(seq_len(nrow(census_df_pred_ipc)), breaks = k, labels = FALSE))
cv.errors.ipc <- matrix(NA, k, 2)
for (j in seq_len(k)) {
  train <- census_df_pred_ipc[folds != j, ]
  holdout <- census_df_pred_ipc[folds == j, ]
  lm.fit.3 <- lm(IncomePerCap ~ Asian + Poverty + Service + Office + Construction +
                   Production + Carpool + Walk + OtherTransp + WorkAtHome +
                   MeanCommute + Employed + PublicWork,
                 data = train)
  lm.fit.4 <- lm(IncomePerCap ~ Asian + logPoverty + logService + Office +
                   logConstruction + logProduction + Carpool + Walk + OtherTransp +
                   WorkAtHome + logMeanCommute + Employed + logPublicWork,
                 data = train)
  ## Held-out MSE for each model on fold j.
  cv.errors.ipc[j, 1] <- mean((holdout$IncomePerCap - predict(lm.fit.3, holdout))^2)
  cv.errors.ipc[j, 2] <- mean((holdout$IncomePerCap - predict(lm.fit.4, holdout))^2)
}
print(cv.errors.ipc)
## [,1] [,2]
## [1,] 167619474 117572774
## [2,] 138375702 103284079
## [3,] 143333240 106582157
## [4,] 285164816 215894718
## [5,] 124510063 95241703
## [6,] 279426027 190512281
## [7,] 188766048 133443393
## [8,] 157029219 133632771
## [9,] 141613222 94261977
## [10,] 172742487 133530140
mean.cv.errors.ipc <- apply(cv.errors.ipc, 2, mean)
print(mean.cv.errors.ipc)
## [1] 179858030 132395599
Once again, we can see by examining the cross-validated MSE of both models, the model with log-transformed variables improves the model.
We can now use our optimized subset variables and use Ridge/Lasso Regression instead of normal linear regression. We will use cross-validation to select the optimal lambda values and to calculate the MSE on a test set.
## Ridge regression for Income on the log-transformed variable subset.
## Build the design matrix from the model formula.
xi <- model.matrix(Income ~ Hispanic + Black + Citizen + logPoverty + logService +
                     Office + logConstruction + logProduction + Drive + logWalk +
                     OtherTransp + WorkAtHome + Employed + logPublicWork,
                   census_df_pred_income)
xi <- xi[, -1]  # drop the intercept column; glmnet fits its own
yi <- census_df_pred_income$Income
## 50/50 train/test split (seeded for reproducibility).
set.seed(1)
traini <- sample(1:nrow(xi), nrow(xi) / 2)
testi <- -traini
yi.test <- yi[testi]
## Candidate lambda grid spanning 10^10 down to 10^-2.
grid <- 10^seq(10, -2, length = 100)
ridge.mod.income <- glmnet(xi[traini, ], yi[traini], alpha = 0, lambda = grid,
                           thresh = 1e-12)
## Choose lambda by 10-fold CV on the training half.
set.seed(100)
cv.out.income <- cv.glmnet(xi[traini, ], yi[traini], alpha = 0, nfolds = 10)
plot(cv.out.income)
bestlam_income <- cv.out.income$lambda.min  # lambda with minimum CV MSE
## Test-set MSE at the CV-selected lambda.
ridge.pred.income <- predict(ridge.mod.income, s = bestlam_income, newx = xi[testi, ])
mean((ridge.pred.income - yi.test)^2)
## [1] 157422938
## Refit on the full data and report ridge coefficients at the chosen lambda.
out_income <- glmnet(xi, yi, alpha = 0)
predict(out_income, type = "coefficients", s = bestlam_income)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 189224.711976
## Hispanic -16.471431
## Black 8.966549
## Citizen -2.259999
## logPoverty -38538.923702
## logService -34621.859573
## Office -573.333977
## logConstruction -7435.272119
## logProduction -14263.721486
## Drive 256.210173
## logWalk -2939.692642
## OtherTransp 905.883220
## WorkAtHome 562.032707
## Employed 2.677705
## logPublicWork -9676.224297
As we can see, the ridge regression predicting Income has a smaller MSE than the previous linear model. The coefficients show that Poverty and Service are very predictive for Income (inversely so).
## Ridge regression for IncomePerCap on the log-transformed variable subset.
## Build the design matrix from the model formula.
xipc <- model.matrix(IncomePerCap ~ Asian + logPoverty + logService + Office +
                       logConstruction + logProduction + Carpool + Walk +
                       OtherTransp + WorkAtHome + logMeanCommute + Employed +
                       logPublicWork,
                     census_df_pred_ipc)
xipc <- xipc[, -1]  # drop the intercept column; glmnet fits its own
yipc <- census_df_pred_ipc$IncomePerCap
## 50/50 train/test split (seeded for reproducibility).
set.seed(1)
train_ipc <- sample(1:nrow(xipc), nrow(xipc) / 2)
test_ipc <- -train_ipc
yipc.test <- yipc[test_ipc]
## Candidate lambda grid spanning 10^10 down to 10^-2.
grid <- 10^seq(10, -2, length = 100)
ridge.mod.ipc <- glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 0,
                        lambda = grid, thresh = 1e-12)
## Choose lambda by 10-fold CV on the training half.
set.seed(100)
cv.out.ipc <- cv.glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 0, nfolds = 10)
plot(cv.out.ipc)
bestlam_ipc <- cv.out.ipc$lambda.min  # lambda with minimum CV MSE
## Test-set MSE at the CV-selected lambda.
ridge.pred.ipc <- predict(ridge.mod.ipc, s = bestlam_ipc, newx = xipc[test_ipc, ])
mean((ridge.pred.ipc - yipc.test)^2)
## [1] 121827455
## Refit on the full data and report ridge coefficients at the chosen lambda.
out_ipc <- glmnet(xipc, yipc, alpha = 0)
predict(out_ipc, type = "coefficients", s = bestlam_ipc)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 162431.693148
## Asian -29.311726
## logPoverty -20855.543176
## logService -29418.039357
## Office -414.307741
## logConstruction -10685.698784
## logProduction -16595.696995
## Carpool -170.236564
## Walk 116.594876
## OtherTransp 1197.254708
## WorkAtHome 222.038050
## logMeanCommute -16422.478997
## Employed 0.997845
## logPublicWork -9175.909717
Once again, the Ridge Regression does in fact have a lower MSE! Poverty and Service are again very predictive, but in this model, MeanCommute and Production are also very predictive.
Let’s test Lasso (again with optimized lambda values)!
dim(xi)
## [1] 2095 14
## Lasso regression for Income.
## BUG FIX: this model was previously fit with alpha = 0, which is RIDGE in
## glmnet; alpha = 1 is the lasso, matching the cv.glmnet call below. The
## printed MSE/coefficients below came from the old alpha = 0 fit and will
## change on re-knit.
lasso.mod.income <- glmnet(xi[traini, ], yi[traini], alpha = 1, lambda = grid)
plot(lasso.mod.income)
## Choose lambda by cross-validation on the training half.
set.seed(1)
cv.out.income <- cv.glmnet(xi[traini, ], yi[traini], alpha = 1)
plot(cv.out.income)
bestlam_income <- cv.out.income$lambda.min
## Test-set MSE at the CV-selected lambda.
lasso.pred.income <- predict(lasso.mod.income, s = bestlam_income, newx = xi[testi, ])
mean((lasso.pred.income - yi.test)^2)
## [1] 154777322
## Refit on the full data and report lasso coefficients at the chosen lambda.
out_income <- glmnet(xi, yi, alpha = 1, lambda = grid)
lasso.coef.income <- predict(out_income, type = "coefficients", s = bestlam_income)
lasso.coef.income
## 15 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 193118.301907
## Hispanic 25.359147
## Black 41.357423
## Citizen -4.605524
## logPoverty -40067.734160
## logService -39065.421622
## Office -583.330870
## logConstruction -8205.594509
## logProduction -14124.876147
## Drive 317.777860
## logWalk -1610.850456
## OtherTransp 885.548935
## WorkAtHome 553.225286
## Employed 5.674311
## logPublicWork -9279.266987
Lasso regression returns about the same test MSE as ridge regression for this model. Note, however, that at the cross-validation-selected lambda the lasso coefficients printed above are all nonzero, so no variables were dropped here; the log-transformed Poverty and Service terms again carry by far the largest weights. This is consistent with what we found in the ridge regression — some variables are far more predictive than others. Ultimately, either the ridge or lasso regression produces a strong model for predicting Income over the optimized, transformed subset of variables.
dim(xipc)
## [1] 2100 13
## Lasso regression for IncomePerCap.
## BUG FIX: this model was previously fit with alpha = 0, which is RIDGE in
## glmnet; alpha = 1 is the lasso, matching the cv.glmnet call below. The
## printed MSE/coefficients below came from the old alpha = 0 fit and will
## change on re-knit.
lasso.mod.ipc <- glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 1, lambda = grid)
plot(lasso.mod.ipc)
## Choose lambda by cross-validation on the training half.
set.seed(1)
cv.out.ipc <- cv.glmnet(xipc[train_ipc, ], yipc[train_ipc], alpha = 1)
plot(cv.out.ipc)
bestlam_ipc <- cv.out.ipc$lambda.min
## Test-set MSE at the CV-selected lambda.
lasso.pred.ipc <- predict(lasso.mod.ipc, s = bestlam_ipc, newx = xipc[test_ipc, ])
mean((lasso.pred.ipc - yipc.test)^2)
## [1] 120754522
## Refit on the full data and report lasso coefficients at the chosen lambda.
out_ipc <- glmnet(xipc, yipc, alpha = 1, lambda = grid)
lasso.coef.ipc <- predict(out_ipc, type = "coefficients", s = bestlam_ipc)
lasso.coef.ipc
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.589693e+05
## Asian -3.184934e+01
## logPoverty -2.090588e+04
## logService -3.289470e+04
## Office -4.392563e+02
## logConstruction -1.088677e+04
## logProduction -1.675182e+04
## Carpool -1.602653e+02
## Walk 1.143845e+02
## OtherTransp 1.236576e+03
## WorkAtHome 1.050498e+02
## logMeanCommute -9.882854e+03
## Employed 8.895556e-01
## logPublicWork -1.001348e+04
In the IncomePerCap model, we can see that lasso regression does improve the MSE over ridge regression. Thus, the lasso regression with the optimal, transformed subset of variables is best for predicting IncomePerCap. It is also interesting that none of the coefficients in the lasso regression of this model are 0. This is a strong indication that all of these variables contain predictive value for IncomePerCap.